"""
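Create sim_score_1981_2015_top5_noselfcite.csv: the rows of
sim_score_1981_2015_top5.csv for which both the citing and the cited patent
have inventor records, plus four extra columns:
no_overlap_inventors, lead_inventor_x, lead_inventor_y, no_overlap_lead_inventor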

Inputs:
    cleandata/sim_score_1981_2015_top5.csv
    rawdata/g_inventor_disambiguated.tsv (PatentsView)
Outputs:
    cleandata/sim_score_1981_2015_top5_noselfcite.csv
"""

import pandas as pd
import numpy as np
import tqdm
import os

os.chdir(r'/Volumes/Zihao_SSD2/PatentsView/')

# Patent-inventor links from PatentsView; only the three columns used below
# are read to keep the frame small.
# patent_id is int64
INVENTOR_COLUMNS = ["patent_id", "inventor_sequence", "inventor_id"]
df_inventor = pd.read_table("rawdata/g_inventor_disambiguated.tsv", sep="\t", usecols=INVENTOR_COLUMNS)
print(df_inventor.shape) # (20427566, 3)
# Remove exact duplicate rows (a no-op when the table has none).
df_inventor.drop_duplicates(inplace=True)
# Replace the inventor_id values by a compact integer code (one code per
# distinct inventor_id) to save space in the sets and dicts built later.
df_inventor["inventor_id_num"] = df_inventor.groupby("inventor_id").ngroup()

# load top5 sim_score data
df_top = pd.read_csv("cleandata/sim_score_1981_2015_top5.csv")
print(df_top.shape) # (24766255, 7)

# Patent IDs that appear on either side of a citing-cited pair.
citing_patent_set = set(df_top["patent_id"])
cited_patent_set = set(df_top["cited_patent_id"])

# Inventor rows restricted to the patents that actually occur in df_top.
df_inventor_citing = df_inventor[df_inventor["patent_id"].isin(citing_patent_set)]
df_inventor_cited = df_inventor[df_inventor["patent_id"].isin(cited_patent_set)]

# Keep only the pairs for which BOTH patents have at least one inventor row;
# the remaining pairs cannot be checked for inventor overlap.
has_citing_inventors = df_top["patent_id"].isin(set(df_inventor_citing["patent_id"]))
has_cited_inventors = df_top["cited_patent_id"].isin(set(df_inventor_cited["patent_id"]))
# .copy() makes df_top_screened an independent frame: new columns are assigned
# to it further down, and assigning to a slice of df_top would raise
# SettingWithCopyWarning.
df_top_screened = df_top[has_citing_inventors & has_cited_inventors].copy()

print(f"df_top nrows = {len(df_top)}; df_top_screened nrows = {len(df_top_screened)}") # 24766255; 24757829
print(f"number of rows (citing-cited pairs) with missing inventor information: {len(df_top) - len(df_top_screened)}") # 8426

def _inventor_sets_by_patent(inventor_rows):
    """Return a dict mapping each patent_id to the set of its inventor_id_num values."""
    ids_per_patent = inventor_rows.groupby(["patent_id"])["inventor_id_num"].unique()
    return {patent: set(ids) for patent, ids in ids_per_patent.items()}


# Lookup tables patent ID -> set of numeric inventor IDs, one for the citing
# side and one for the cited side of the pairs.
hashmap_citing = _inventor_sets_by_patent(df_inventor_citing)
hashmap_cited = _inventor_sets_by_patent(df_inventor_cited)


# For every citing-cited pair, True when the two patents share no inventor
# (no overlap = True), False when at least one inventor is on both patents.
# Only the two ID columns are iterated (cheaper than building a full row tuple
# per pair), and tqdm is given the row count so it can show progress and ETA.
keep = [
    hashmap_citing[citing_id].isdisjoint(hashmap_cited[cited_id])
    for citing_id, cited_id in tqdm.tqdm(
        zip(df_top_screened["patent_id"], df_top_screened["cited_patent_id"]),
        total=len(df_top_screened),
    )
]
df_top_screened.loc[:, "no_overlap_inventors"] = keep

# Sanity checks: every citing-cited pair must occur only once.
assert not df_top[["patent_id", "cited_patent_id"]].duplicated().any()
assert not df_top_screened[["patent_id", "cited_patent_id", "no_overlap_inventors"]].duplicated().any()

# Rows with inventor_sequence == 0 are treated as the lead inventor of a patent.
df_lead_inventor = df_inventor[df_inventor["inventor_sequence"] == 0]
# Each patent must have at most one lead inventor. Exact duplicate rows were
# already dropped when the table was loaded, so a whole-row duplicate check
# could never fail; what matters is that patent_id is unique here, because the
# patent -> lead inventor dict built next would silently keep only one entry.
assert not df_lead_inventor["patent_id"].duplicated().any()

# The unscreened frame is no longer needed; release it before adding columns.
del df_top

# patent ID -> numeric ID of its lead inventor (the inventor_sequence == 0 row)
lead_inventor_dict = df_lead_inventor.set_index("patent_id")["inventor_id_num"].to_dict()

# Attach the lead inventor of the citing patent (x) and of the cited patent (y).
for target_column, id_column in (
    ("lead_inventor_x", "patent_id"),
    ("lead_inventor_y", "cited_patent_id"),
):
    df_top_screened[target_column] = df_top_screened[id_column].map(lead_inventor_dict)

# True when the two patents have different lead inventors.
# NOTE(review): a patent missing from lead_inventor_dict maps to NaN, and NaN
# compares unequal to everything, so such pairs come out as True here --
# confirm that this is the intended treatment of unknown lead inventors.
df_top_screened["no_overlap_lead_inventor"] = df_top_screened["lead_inventor_x"].ne(
    df_top_screened["lead_inventor_y"]
)

print(df_top_screened.shape) # (24757829, 11)
print('Exporting dataset...')
df_top_screened.to_csv("cleandata/sim_score_1981_2015_top5_noselfcite.csv", index=False)
